#!/usr/bin/env python
# Train an LDA topic model with gensim's multicore implementation.
from gensim.models.ldamulticore import LdaMulticore
from gensim import corpora

# Load pre-tokenized documents: one document per line, tokens separated by whitespace.
texts = []
with open('tokenized.txt') as f:
    for line in f:
        texts.append(line.strip().split())

# Build the token-to-id mapping, then drop very rare and very common tokens
# (tokens in fewer than 10 documents or in more than 10% of documents).
dictionary = corpora.Dictionary(texts)
print(dictionary)
dictionary.filter_extremes(no_below=10, no_above=0.1)
print(dictionary)
dictionary.save("text.dict")

# Convert documents to bag-of-words vectors, serialize the corpus in Matrix
# Market format, and stream it back from disk for training.
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('corpora.mm', corpus)
corpus = corpora.MmCorpus('corpora.mm')

print("Starting Training LDA Model...")
lda = LdaMulticore(corpus, id2word=dictionary, num_topics=100, workers=11)
lda.save("lda.model")  # save() returns None, so there is nothing useful to print
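
# Usage sketch (an assumption-laden example, not part of the training run):
# reload the saved "text.dict" and "lda.model" produced above, print a few
# topics, and infer the topic mixture of a new tokenized document. "new_tokens"
# is a hypothetical document used purely for illustration.
loaded_dict = corpora.Dictionary.load("text.dict")
loaded_lda = LdaMulticore.load("lda.model")

# show_topics() returns (topic_id, formatted term string) pairs by default.
for topic_id, topic_terms in loaded_lda.show_topics(num_topics=5, num_words=10):
    print(topic_id, topic_terms)

new_tokens = ["example", "document", "tokens"]
bow = loaded_dict.doc2bow(new_tokens)
print(loaded_lda[bow])  # list of (topic_id, probability) pairs for this document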